In [1]:
## Importing Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split  ## sklearn.cross_validation was removed in 0.20
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score, accuracy_score, auc, roc_curve, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import yellowbrick
from yellowbrick.classifier import ROCAUC
%matplotlib inline
plt.style.use('ggplot')
In [2]:
df = pd.read_csv('Churn_Modelling.csv', low_memory=False)
In [3]:
df.head()
Out[3]:
In [4]:
df.info() ## seems like no missing values
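## Quick explicit check for missing values, in case anything slipped past info() (sketch):
df.isnull().sum()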
In [5]:
plt.figure(figsize=(15,7))
sns.histplot(df['CreditScore'], kde=True)  ## distplot is deprecated in recent seaborn
plt.title('Distribution of the Credit Score')
plt.show()
## Almost a Normal Distribution
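## To back up the eyeball test, skewness and excess kurtosis are cheap to compute
## (sketch; values near 0 are consistent with a roughly normal shape):
print("Skew: {:.3f}".format(df['CreditScore'].skew()))
print("Excess kurtosis: {:.3f}".format(df['CreditScore'].kurtosis()))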
In [6]:
## Logistic regression with just the credit score as the predictor and Exited as the target
X = df[['CreditScore']]
y = df['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 25)
clr = LogisticRegression()
clr.fit(X_train, y_train)
clr.predict(X_test)
print("Accuracy of the model: {}".format(accuracy_score(y_test, clr.predict(X_test))))
print("10-fold cross validation accuracy of the model: {}".format(cross_val_score(clr,X_train, y_train, cv=10).mean()))
## Accuracy looks decent, but roughly 80% of customers did not exit, so a trivial "always predict 0" model scores about the same
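## Sanity check (sketch): compare against the majority-class baseline before trusting the accuracy.
print("Share of customers who stayed: {:.3f}".format(1 - y_test.mean()))
print("Model accuracy: {:.3f}".format(accuracy_score(y_test, clr.predict(X_test))))
## If the two numbers are close, the model is barely beating "always predict 0".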
In [7]:
roc_auc_score(y_test, clr.predict(X_test)) ## Really bad: hard 0/1 labels discard the ranking information AUC needs
Out[7]:
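## AUC measures how well the model ranks customers, so score the positive-class
## probabilities rather than the hard labels (sketch):
roc_auc_score(y_test, clr.predict_proba(X_test)[:, 1])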
In [8]:
plt.figure(figsize=(10,7))
probs = clr.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'LR AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
Out[8]:
In [9]:
plt.figure(figsize=(12,7))
plt.title('Geography Count Plot')
df['Geography'].value_counts().plot(kind='barh')
## About half the dataset is from France
Out[9]:
In [10]:
plt.figure(figsize=(10,7))
plt.title('Gender Count Plot')
df['Gender'].value_counts().plot(kind='barh', color='green')
## Almost balanced
Out[10]:
In [11]:
## Question to answer: how many customers are from each country, and what share of them exited?
for i in df['Geography'].unique():
    print(i)
    print(20 * '--')
    print(len(df[df['Geography'] == i]))
    print(len(df[(df['Geography'] == i) & (df['Exited'] == 1)]) / len(df[df['Geography'] == i]))
    print('\n')
## We can see that in Germany about 1/3 of the people exited
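## The same breakdown in one line with groupby, if preferred (sketch; the mean of Exited is the churn rate):
df.groupby('Geography')['Exited'].agg(['count', 'mean'])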
In [12]:
plt.figure(figsize=(10,7))
sns.countplot(data=df, x='Geography', hue='Exited')  ## recent seaborn requires keyword arguments
Out[12]:
In [13]:
plt.figure(figsize=(10,7))
sns.countplot(data=df, x='Geography', hue='Gender')
Out[13]:
In [14]:
plt.figure(figsize=(10,7))
sns.kdeplot(df['Age'])
## Roughly bell-shaped, but with a clear right skew toward older customers
Out[14]:
In [15]:
plt.figure(figsize=(10,7))
sns.kdeplot(df['EstimatedSalary'])
Out[15]:
In [16]:
df['Tenure'].describe()
Out[16]:
In [17]:
## Let's find out which features turn out to be the important ones, using recursive feature elimination (RFE)
In [18]:
rfe = RFE(RandomForestClassifier(n_estimators=1000), n_features_to_select=5)
In [19]:
X = df.iloc[:,3:-1]
y = df['Exited']
## Encode the categorical variables
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 25)
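## Worth confirming what get_dummies produced; drop_first=True keeps k-1 dummy columns
## per categorical variable to avoid perfectly collinear columns (sketch):
print(X.columns.tolist())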
In [20]:
rfe.fit(X_train,y_train)
Out[20]:
In [21]:
rfe.ranking_
Out[21]:
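## The raw ranking array is hard to read; pairing it with the column names helps
## (sketch; rank 1 = kept by RFE):
pd.Series(rfe.ranking_, index=X.columns).sort_values()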
In [22]:
rfe.predict(X_test)
Out[22]:
In [23]:
roc_auc_score(y_test, rfe.predict(X_test)) ## Slightly better, though still computed from hard labels rather than probabilities
Out[23]:
In [24]:
plt.figure(figsize=(10,7))
probs = rfe.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(y_test, preds)  ## use the probabilities computed above, not hard predictions
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'RFE AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
Out[24]:
In [25]:
rf = RandomForestClassifier(n_estimators=1000)
In [26]:
rf.fit(X_train, y_train)
Out[26]:
In [27]:
rf.predict(X_test)
Out[27]:
In [28]:
roc_auc_score(y_test, rf.predict(X_test))
Out[28]:
In [29]:
plt.figure(figsize=(10,7))
probs = rf.predict_proba(X_test)  ## score the full random forest here, not the RFE model
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(y_test, preds)  ## probabilities, not hard predictions
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'RF AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
Out[29]:
In [30]:
for importance, feature in zip(rf.feature_importances_, X.columns):
    print(feature, importance)
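## Sorting makes the importances much easier to compare (sketch):
pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)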
In [31]:
rfe.ranking_ ## compare the RFE ranking with the importances above
Out[31]:
In [32]:
## Okay good!
## Let's go with the random forest and build on it; let's also look at the other metrics to see how it is doing
In [33]:
y_pred = rf.predict(X_test)
In [34]:
print("Accuracy of the model: {}".format(accuracy_score(y_test, y_pred)))
print("10-fold cross validation accuracy of the model: {}".format(cross_val_score(clr,X_train, y_train, cv=10).mean()))
In [35]:
confusion_matrix(y_test, y_pred)
Out[35]:
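## A labeled version is easier to read; the row/column names below are my own labels,
## not sklearn output (sketch; rows = actual, columns = predicted):
pd.DataFrame(confusion_matrix(y_test, y_pred),
             index=['Actual: Stayed', 'Actual: Exited'],
             columns=['Predicted: Stayed', 'Predicted: Exited'])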
In [36]:
print(classification_report(y_test, y_pred))
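## yellowbrick's ROCAUC was imported at the top but never used; a minimal sketch of how it
## could wrap the fitted forest (the class names are my own labels; on older yellowbrick
## versions show() is called poof()):
visualizer = ROCAUC(rf, classes=['Stayed', 'Exited'])
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()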
In [37]:
## Ends here!!